今天我們就來試試在 PHPUnit 裡寫爬蟲,我們就來爬PTT的首頁吧
我們可以很快速的建立第一個測試案例,寫完測試案例後立刻執行一次測試得到紅燈
<?php
// tests/PttCrawlerTest.php
namespace Recca0120\Ithome30\Tests;
use PHPUnit\Framework\TestCase;
/**
 * Checks that the crawler turns the PTT hot-boards page into board records.
 */
class PttCrawlerTest extends TestCase
{
    /**
     * The first record returned by all() should describe the Gossiping board.
     */
    public function test_crawl_home()
    {
        // Fully qualified on purpose: this file is in the Recca0120\Ithome30\Tests
        // namespace, so a bare `PttCrawler` would be looked up as
        // Recca0120\Ithome30\Tests\PttCrawler, which does not exist.
        $crawler = new \Recca0120\Ithome30\PttCrawler();
        $records = $crawler->all();

        self::assertEquals([
            'name' => 'Gossiping',
            'nuser' => '13483',
            'class' => '綜合',
            'title' => '[八卦]不停重複今日公祭明日忘記',
        ], $records[0]);
    }
}
先寫以下的程式碼
<?php
// src/PttCrawler.php
namespace Recca0120\Ithome30;
/**
 * First draft of the crawler: it only downloads the page and prints it.
 */
class PttCrawler
{
    /**
     * Downloads the PTT hot-boards page and dumps the raw HTML.
     *
     * Nothing is returned yet; the dump is only there to confirm that the
     * download works.
     */
    public function all()
    {
        var_dump(file_get_contents('https://www.ptt.cc/bbs/hotboards.html'));
    }
}
再執行測試,在 Output 裡會得到以下結果(測試依然是紅燈)
🚀 PHPUnit 10.3.4 by Sebastian Bergmann and contributors.
Runtime: PHP 8.1.23
Configuration: /Users/recca0120/Sites/ithome-30/phpunit.xml
Recca0120\Ithome30\Tests\PttCrawlerTest
❌ crawl_home 789 ms
┐
├ null does not match expected type "array".
├
├ This test printed output: string(54319) "<!DOCTYPE html>
├ <html>
├ <head>
├ <meta charset="utf-8">
├
├
├ <meta name="viewport" content="width=device-width, initial-scale=1">
├
├ <title>熱門看板 - 批踢踢實業坊</title>
├
├ <link rel="stylesheet" type="text/css" href="//images.ptt.cc/bbs/v2.27/bbs-common.css">
├ <link rel="stylesheet" type="text/css" href="//images.ptt.cc/bbs/v2.27/bbs-base.css" media="screen">
├ <link rel="stylesheet" type="text/css" href="//images.ptt.cc/bbs/v2.27/bbs-custom.css">
├ <link rel="stylesheet" type="text/css" href="//images.ptt.cc/bbs/v2.27/pushstream.css" media="screen">
├ <link rel="stylesheet" type="text/css" href="//images.ptt.cc/bbs/v2.27/bbs-print.css" media="print">
├
├
├
├
├ </head>
├ <body>
....
這樣我們就可以先確定程式能正常抓到原始的 HTML 了,接著我們可以先把程式碼改為
<?php
// src/PttCrawler.php
namespace Recca0120\Ithome30;
/**
 * Second draft of the crawler: the download now lives in its own method.
 */
class PttCrawler
{
    /**
     * Dumps whatever fetch() returns; nothing is returned yet.
     */
    public function all()
    {
        $html = $this->fetch();
        var_dump($html);
    }

    // The visibility modifier must be protected (a private method could not be
    // overridden by a subclass).
    protected function fetch()
    {
        $url = 'https://www.ptt.cc/bbs/hotboards.html';

        return file_get_contents($url);
    }
}
再執行測試,在 Output 裡應該要得到和之前相同的結果(測試依然是紅燈)。在進行下一步之前,我們先把 PTT 的 HTML 另存到 tests/fixtures/ptt_home.html 裡,並把測試案例改成下面這樣,至於為什麼要這樣修改接下來再說明
<?php
// tests/PttCrawlerTest.php
namespace Recca0120\Ithome30\Tests;
use PHPUnit\Framework\TestCase;
/**
 * Checks the crawler's parsing against a stubbed crawler instead of the real one.
 */
class PttCrawlerTest extends TestCase
{
    /**
     * The first record returned by all() should describe the Gossiping board.
     */
    public function test_crawl_home()
    {
        $expected = [
            'name' => 'Gossiping',
            'nuser' => '13483',
            'class' => '綜合',
            'title' => '[八卦]不停重複今日公祭明日忘記'
        ];

        $stub = new StubPttCrawler();
        $boards = $stub->all();

        self::assertEquals($expected, $boards[0]);
    }
}
/**
 * Test double that feeds the crawler a saved copy of the page instead of
 * downloading it.
 *
 * The parent is fully qualified on purpose: this file is in the
 * Recca0120\Ithome30\Tests namespace, so a bare `PttCrawler` would be looked
 * up as Recca0120\Ithome30\Tests\PttCrawler, which does not exist.
 */
class StubPttCrawler extends \Recca0120\Ithome30\PttCrawler
{
    /**
     * Returns the contents of fixtures/ptt_home.html located next to this file.
     */
    protected function fetch()
    {
        return file_get_contents(__DIR__.'/fixtures/ptt_home.html');
    }
}
這時我們就可以利用 class 的繼承來修改 fetch method,所以直接在測試案例裡建一個新的 StubPttCrawler,讓 StubPttCrawler 繼承 PttCrawler 並且覆寫 fetch,讓 fetch 直接回傳我們剛剛另存下來的 ptt_home.html。為什麼要這樣做呢?有以下好處:測試不需要每次都連線到 PTT,執行速度快,也不會受網路狀況影響;而且 fixture 的內容是固定的,預期結果不會因為熱門看板的即時變動而失效
接著我們就可以利用快速鍵重新執行測試的方式來進行開發,會先獲得這樣的程式碼
<?php
// src/PttCrawler.php
namespace Recca0120\Ithome30;
/**
 * Crawls the PTT hot-boards page and extracts one record per board.
 */
class PttCrawler
{
    /**
     * Fetches the page and parses every board entry in it.
     *
     * @return array<int, array<string, string>> one map per board entry, keyed
     *     by the suffix of each `board-*` CSS class found inside the entry
     */
    public function all()
    {
        $html = $this->fetch();

        // Every board entry is an <a> wrapped in <div class="b-ent">.
        preg_match_all('/<div class="b-ent">\s+<a[^>]+>.*?<\/a>\s+<\/div>/s', $html, $matches);
        $rows = $matches[0];

        $records = [];
        foreach ($rows as $row) {
            // Lazy `.+?` together with the `s` modifier stops each value at its
            // own closing </div>, so the columns are still separated when
            // several board-* divs share one line.
            preg_match_all('/<div\sclass="board-(?<name>\w+)">(?<value>.+?)<\/div[^>]*>/s', $row, $matches);

            $temp = [];
            foreach (array_keys($matches[0]) as $index) {
                // Drop nested markup and the leading "◎" marker from the value.
                $temp[$matches['name'][$index]] = str_replace('◎', '', strip_tags($matches['value'][$index]));
            }
            $records[] = $temp;
        }

        return $records;
    }

    /**
     * Downloads the raw HTML of the hot-boards page.
     *
     * Kept protected and without a return type so that subclasses can
     * override it with an untyped method.
     */
    protected function fetch()
    {
        return file_get_contents('https://www.ptt.cc/bbs/hotboards.html');
    }
}
我們再接著把程式碼重構成這個樣子
<?php
namespace Recca0120\Ithome30;
/**
 * Crawls the PTT hot-boards page and extracts one record per board.
 */
class PttCrawler
{
    /**
     * Fetches the page and parses every board entry in it.
     *
     * @return array<int, array<string, string>> one map per board entry, keyed
     *     by the suffix of each `board-*` CSS class found inside the entry
     */
    public function all()
    {
        $html = $this->fetch();
        $rows = $this->parseRows($html);

        $records = [];
        foreach ($rows as $row) {
            $records[] = $this->parseCols($row);
        }

        return $records;
    }

    /**
     * Splits the page into the HTML fragments of the individual board entries.
     *
     * @return array<int, string> one `<div class="b-ent">…</div>` fragment per entry
     */
    private function parseRows(string $html): array
    {
        preg_match_all('/<div class="b-ent">\s+<a[^>]+>.*?<\/a>\s+<\/div>/s', $html, $matches);

        return $matches[0];
    }

    /**
     * Extracts the `board-*` columns of a single board entry.
     *
     * @return array<string, string> column name => plain-text value
     */
    private function parseCols(string $html): array
    {
        // Lazy `.+?` together with the `s` modifier stops each value at its own
        // closing </div>, so the columns are still separated when several
        // board-* divs share one line.
        preg_match_all('/<div\sclass="board-(?<name>\w+)">(?<value>.+?)<\/div[^>]*>/s', $html, $matches);

        $cols = [];
        foreach (array_keys($matches[0]) as $index) {
            // Drop nested markup and the leading "◎" marker from the value.
            $cols[$matches['name'][$index]] = str_replace('◎', '', strip_tags($matches['value'][$index]));
        }

        return $cols;
    }

    /**
     * Downloads the raw HTML of the hot-boards page.
     *
     * Kept protected and without a return type so that subclasses can
     * override it with an untyped method.
     */
    protected function fetch()
    {
        return file_get_contents('https://www.ptt.cc/bbs/hotboards.html');
    }
}
覺得還不夠滿意,我們可以再重構成這個樣子
<?php
namespace Recca0120\Ithome30;
/**
 * Crawls the PTT hot-boards page and extracts one record per board.
 */
class PttCrawler
{
    /**
     * Fetches the page and parses every board entry in it.
     *
     * @return array<int, array<string, string>> one map per board entry, keyed
     *     by the suffix of each `board-*` CSS class found inside the entry
     */
    public function all()
    {
        return array_map(
            fn (string $row): array => $this->parseCols($row),
            $this->parseRows($this->fetch())
        );
    }

    /**
     * Splits the page into the HTML fragments of the individual board entries.
     *
     * @return array<int, string> one `<div class="b-ent">…</div>` fragment per entry
     */
    private function parseRows(string $html): array
    {
        preg_match_all('/<div class="b-ent">\s+<a[^>]+>.*?<\/a>\s+<\/div>/s', $html, $matches);

        return $matches[0];
    }

    /**
     * Extracts the `board-*` columns of a single board entry.
     *
     * @return array<string, string> column name => plain-text value
     */
    private function parseCols(string $html): array
    {
        // Lazy `.+?` together with the `s` modifier stops each value at its own
        // closing </div>, so the columns are still separated when several
        // board-* divs share one line.
        preg_match_all('/<div\sclass="board-(?<name>\w+)">(?<value>.+?)<\/div[^>]*>/s', $html, $matches);

        $cols = [];
        foreach (array_keys($matches[0]) as $index) {
            // Drop nested markup and the leading "◎" marker from the value.
            $cols[$matches['name'][$index]] = str_replace('◎', '', strip_tags($matches['value'][$index]));
        }

        return $cols;
    }

    /**
     * Downloads the raw HTML of the hot-boards page.
     *
     * Kept protected and without a return type so that subclasses can
     * override it with an untyped method.
     */
    protected function fetch()
    {
        return file_get_contents('https://www.ptt.cc/bbs/hotboards.html');
    }
}
這樣是不是清爽多了